/* 
    Purpose: Using the 1940 full-count Census, this file identifies (1) men 
             aged 30-50 who are fathers of a child younger than 18 in the 
             same household and (2) women aged 30-50 who are mothers of a 
             child younger than 18 in the same household, keeping only 
             households in which a mother is the household head. Wage 
             income (incwage) is cleaned and summed to the family and 
             household level for both samples.

    Notes: (1) Incwage was asked of all individuals 14+ in the 1940 Census.
           (2) 1940 Census data can be found at the directory 
               /homes/data/cens1940/1940_2-0/100files/ on the 
               NBER server. 
    
    Creates: Census1940_full.dta (intermediate, saved in the working directory)
             Census1940_full_raw_fathers30to50.dta
             Census1940_full_raw_mothers30to50.dta

*/

cd /*YOUR NBER SERVER DIRECTORY*/

/* The 1940 full-count Census is stored on the NBER server as 100 separate
   files. Step 1 trims each piece to the variables used later in this file
   and parks it in its own tempfile; step 2 stacks the 100 trimmed pieces,
   in order, into a single dataset saved in the working directory. */

	* Variables carried forward from every piece
	local keepvars serial pernum occ1950 incwage age race region statefip poploc educ labforce famunit occscore classwkr momloc relate

	* Step 1: trim each of the 100 pieces
	forvalues k = 1/100 {
		use /homes/data/cens1940/1940_2-0/100files/us1940b_usa`k'.dta, clear //Directory on NBER server
		keep `keepvars'

		tempfile data`k'
		save `data`k''
	}

	* Step 2: start from piece 1 and append pieces 2 through 100
	use `data1', clear
	forvalues k = 2/100 {
		append using `data`k''
	}

	save Census1940_full.dta, replace
	
**----------------------------------------------**

************************
** FATHERS
************************

	* FATHERS SAMPLE
	* Output: one row per man aged 30-50 who is referenced (through poploc) by at
	* least one child under 18 in the same household, with cleaned incwage and
	* family-/household-level income and head-count aggregates attached.
	* Reads Census1940_full.dta from the working directory.

* Identify fathers 
	* Load only children under 18 and the columns needed to locate their fathers
	* (avoids holding, and re-saving, the full person file at this stage).
	use serial poploc age if age<18 using Census1940_full.dta, clear //Restrict to children younger than 18
	drop age
	
	drop if poploc==0 | poploc==. //Exclude children without a father in the house
	
	* One row per (household, father), carrying the number of his children under 18
	bysort serial poploc: gen number_children = _N
	by serial poploc: keep if _n==1 //Keep all unique father ids. Some fathers will have multiple children in the Census. 
	rename poploc pernum //poploc holds the father's pernum; rename so it can serve as the merge key

	tempfile children 
	save `children'
	
	* Household list: keep serial ONLY. If number_children stayed in this file,
	* the m:1 merge below would copy one arbitrary father's count onto everyone
	* in the household, and the later 1:1 merge (which never overwrites master
	* values) would leave that wrong count on any other father in the household.
	keep serial
	bysort serial: keep if _n==1
	
	tempfile children_hh
	save `children_hh'

* Identify and keep households with children
	use Census1940_full.dta, clear
	merge m:1 serial using `children_hh', keep(match) nogenerate
	
* Bring in fathers and match them to children
	* keep(master match): discard father ids that match no person record
	merge 1:1 serial pernum using `children', keep(master match)
	
	gen father = _merge==3 
	drop _merge
	
* Fix individual income
	gen incwage_og = incwage //untouched copy of the raw value
	* NOTE(review): 999998 and above are presumably the missing / not-applicable
	* codes for incwage - confirm against the codebook.
	replace incwage=. if incwage>=999998
		
* Family and household income
	gen temp = incwage
	replace temp=0 if age<14  //under 14 was not asked incwage: count as zero, not missing
	
	gen missing_inc = temp==.
	gen earner = temp>0 & temp<.
	
	* total() treats missing as zero, so the income sums cover reported wages
	* only; the number_missing_inc_* counters record how many were missing.
	sort serial famunit
	by serial famunit: egen fam_income = total(temp)
	by serial famunit: egen number_earner_fam = total(earner)
	by serial famunit: egen number_missing_inc_fam = total(missing_inc)
	
	by serial: egen hh_income = total(temp)
	by serial: egen number_earner_hh = total(earner)
	by serial: egen number_missing_inc_hh = total(missing_inc)
	
	* Diagnostic only: household income distribution for the target fathers
	sum hh_income if father==1 & age<=50 & age>=30 & incwage<., d

	drop temp earner missing_inc
	
* Count number of children present
	gen child = age<18

	by serial: egen number_children_HH = total(child)
	by serial famunit: egen number_children_fam = total(child)
	
* Count number of people in household/family
	by serial: gen number_people_HH = _N
	by serial famunit: gen number_people_fam = _N
	
	gen universe = age>=14 //universe of adults asked about incwage
	by serial: egen number_adult_universe_HH = total(universe)
	by serial famunit: egen number_adult_universe_fam = total(universe)
	
	drop child universe

* Keep fathers ages 30 to 50
	keep if father==1
	keep if age>=30 & age<=50
	
	compress 
	save /*YOUR PATH*/Census1940_full_raw_fathers30to50.dta, replace
	* In this replication folder, we act as if we're saving this in "$Mydirectory1/1_DataSources/CensusData/input"

	
**----------------------------------------------**

************************
** MOTHERS
************************

	* MOTHERS SAMPLE
	* Output: one row per woman aged 30-50 who is referenced (through momloc) by
	* at least one child under 18 in the same household, restricted to
	* households in which a mother is the household head, with cleaned incwage
	* and family-/household-level income aggregates attached.
	* Reads Census1940_full.dta from the working directory.

* Identify mothers 
	* Load only children under 18 and the columns needed to locate their mothers
	* (avoids holding, and re-saving, the full person file at this stage).
	use serial momloc age if age<18 using Census1940_full.dta, clear
	drop age
	
	drop if momloc==0 | momloc==. //Exclude children without a mother in the house
	
	bysort serial momloc: keep if _n==1 //Keep all unique mother ids. Some mothers will have multiple children in the Census. 
	rename momloc pernum //momloc holds the mother's pernum; rename so it can serve as the merge key

	tempfile children 
	save `children'
	
	* Household list: one row per household that contains at least one mother
	keep serial
	bysort serial: keep if _n==1
	
	tempfile children_hh
	save `children_hh'

* Now identify and keep households with children
	use Census1940_full.dta, clear
	merge m:1 serial using `children_hh', keep(match) nogenerate
	
* Now bring in mothers and match them to children
	* keep(master match): discard mother ids that match no person record
	merge 1:1 serial pernum using `children', keep(master match)
	
	gen mother = _merge==3 
	drop _merge
	
* Identify mothers who are heads of household and keep only those houses (main difference)
	* NOTE(review): 101 looks like a detailed relationship code for the household
	* head - confirm that relate in these files holds detailed (not general) codes.
	gen temp1 = mother==1 & relate==101
	bysort serial: egen mother_HH = max(temp1) //1 for every person in a household headed by a mother
	tab mother_HH
	
	keep if mother_HH==1
	
* Fix individual income
	gen incwage_og = incwage //untouched copy of the raw value
	* NOTE(review): 999998 and above are presumably the missing / not-applicable
	* codes for incwage - confirm against the codebook.
	replace incwage=. if incwage>=999998
		
* Family and household income
	gen temp = incwage
	replace temp=0 if age<14  //under 14 was not asked incwage: count as zero, not missing
	
	gen missing_inc = temp==.
	gen earner = temp>0 & temp<.
	
	* total() treats missing as zero, so the income sums cover reported wages
	* only; the number_missing_inc_* counters record how many were missing.
	sort serial famunit
	by serial famunit: egen fam_income = total(temp)
	by serial famunit: egen number_earner_fam = total(earner)
	by serial famunit: egen number_missing_inc_fam = total(missing_inc)
	
	by serial: egen hh_income = total(temp)
	by serial: egen number_earner_hh = total(earner)
	by serial: egen number_missing_inc_hh = total(missing_inc)
	
	drop temp temp1 earner missing_inc
	
* Number of people in household that could have income
	gen universe = age>=14 //universe of adults asked about incwage
	by serial: egen number_adult_universe_HH = total(universe)
	by serial famunit: egen number_adult_universe_fam = total(universe)
	
	drop universe

* Keep mothers ages 30 to 50
	keep if mother==1
	keep if age>=30 & age<=50
	
	compress 
	save /*YOUR PATH*/Census1940_full_raw_mothers30to50.dta, replace
	* In this replication folder, we act as if we're saving this in "$Mydirectory1/1_DataSources/CensusData/input"
